import os
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import warnings
warnings.filterwarnings("ignore")
# Load the three CSV exports of the Spotify dataset into DataFrames:
#   year_data  <- data_by_year.csv
#   genre_data <- data_by_genres.csv
#   data       <- data/data.csv
# NOTE(review): data.csv is read from a 'data/' subfolder while the other two
# files are read from the dataset root -- confirm this matches the layout on disk.
year_data = pd.read_csv('../../Spotify Music Dataset/data_by_year.csv')
genre_data = pd.read_csv('../../Spotify Music Dataset/data_by_genres.csv')
data = pd.read_csv('../../Spotify Music Dataset/data/data.csv')

# Summary statistics for the numeric columns of `data`.
data.describe()
| valence | year | acousticness | danceability | duration_ms | energy | explicit | instrumentalness | key | liveness | loudness | mode | popularity | speechiness | tempo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 170653.000000 | 170653.000000 | 170653.000000 | 170653.000000 | 1.706530e+05 | 170653.000000 | 170653.000000 | 170653.000000 | 170653.000000 | 170653.000000 | 170653.000000 | 170653.000000 | 170653.000000 | 170653.000000 | 170653.000000 |
| mean | 0.528587 | 1976.787241 | 0.502115 | 0.537396 | 2.309483e+05 | 0.482389 | 0.084575 | 0.167010 | 5.199844 | 0.205839 | -11.467990 | 0.706902 | 31.431794 | 0.098393 | 116.861590 |
| std | 0.263171 | 25.917853 | 0.376032 | 0.176138 | 1.261184e+05 | 0.267646 | 0.278249 | 0.313475 | 3.515094 | 0.174805 | 5.697943 | 0.455184 | 21.826615 | 0.162740 | 30.708533 |
| min | 0.000000 | 1921.000000 | 0.000000 | 0.000000 | 5.108000e+03 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -60.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.317000 | 1956.000000 | 0.102000 | 0.415000 | 1.698270e+05 | 0.255000 | 0.000000 | 0.000000 | 2.000000 | 0.098800 | -14.615000 | 0.000000 | 11.000000 | 0.034900 | 93.421000 |
| 50% | 0.540000 | 1977.000000 | 0.516000 | 0.548000 | 2.074670e+05 | 0.471000 | 0.000000 | 0.000216 | 5.000000 | 0.136000 | -10.580000 | 1.000000 | 33.000000 | 0.045000 | 114.729000 |
| 75% | 0.747000 | 1999.000000 | 0.893000 | 0.668000 | 2.624000e+05 | 0.703000 | 0.000000 | 0.102000 | 8.000000 | 0.261000 | -7.183000 | 1.000000 | 48.000000 | 0.075600 | 135.537000 |
| max | 1.000000 | 2020.000000 | 0.996000 | 0.988000 | 5.403500e+06 | 1.000000 | 1.000000 | 1.000000 | 11.000000 | 1.000000 | 3.855000 | 1.000000 | 100.000000 | 0.970000 | 243.507000 |
# Preview the first rows of the per-year table loaded from data_by_year.csv.
year_data.head()
| mode | year | acousticness | danceability | duration_ms | energy | instrumentalness | liveness | loudness | speechiness | tempo | valence | popularity | key | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1921 | 0.886896 | 0.418597 | 260537.166667 | 0.231815 | 0.344878 | 0.205710 | -17.048667 | 0.073662 | 101.531493 | 0.379327 | 0.653333 | 2 |
| 1 | 1 | 1922 | 0.938592 | 0.482042 | 165469.746479 | 0.237815 | 0.434195 | 0.240720 | -19.275282 | 0.116655 | 100.884521 | 0.535549 | 0.140845 | 10 |
| 2 | 1 | 1923 | 0.957247 | 0.577341 | 177942.362162 | 0.262406 | 0.371733 | 0.227462 | -14.129211 | 0.093949 | 114.010730 | 0.625492 | 5.389189 | 0 |
| 3 | 1 | 1924 | 0.940200 | 0.549894 | 191046.707627 | 0.344347 | 0.581701 | 0.235219 | -14.231343 | 0.092089 | 120.689572 | 0.663725 | 0.661017 | 10 |
| 4 | 1 | 1925 | 0.962607 | 0.573863 | 184986.924460 | 0.278594 | 0.418297 | 0.237668 | -14.146414 | 0.111918 | 115.521921 | 0.621929 | 2.604317 | 5 |
# Print column names, non-null counts and dtypes of the per-genre table.
genre_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2973 entries, 0 to 2972 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mode 2973 non-null int64 1 genres 2973 non-null object 2 acousticness 2973 non-null float64 3 danceability 2973 non-null float64 4 duration_ms 2973 non-null float64 5 energy 2973 non-null float64 6 instrumentalness 2973 non-null float64 7 liveness 2973 non-null float64 8 loudness 2973 non-null float64 9 speechiness 2973 non-null float64 10 tempo 2973 non-null float64 11 valence 2973 non-null float64 12 popularity 2973 non-null float64 13 key 2973 non-null int64 dtypes: float64(11), int64(2), object(1) memory usage: 325.3+ KB
# Print column names, non-null counts and dtypes of the main `data` table.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 170653 entries, 0 to 170652 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 valence 170653 non-null float64 1 year 170653 non-null int64 2 acousticness 170653 non-null float64 3 artists 170653 non-null object 4 danceability 170653 non-null float64 5 duration_ms 170653 non-null int64 6 energy 170653 non-null float64 7 explicit 170653 non-null int64 8 id 170653 non-null object 9 instrumentalness 170653 non-null float64 10 key 170653 non-null int64 11 liveness 170653 non-null float64 12 loudness 170653 non-null float64 13 mode 170653 non-null int64 14 name 170653 non-null object 15 popularity 170653 non-null int64 16 release_date 170653 non-null object 17 speechiness 170653 non-null float64 18 tempo 170653 non-null float64 dtypes: float64(9), int64(6), object(4) memory usage: 24.7+ MB
# Convert the track duration from milliseconds to minutes (rounded to one
# decimal place) and rename the column so its name matches the new unit.
#
# The guard keeps this cell safe to re-run: after the first execution the
# 'duration_ms' column no longer exists, and an unguarded lookup would raise
# KeyError.
if 'duration_ms' in data.columns:
    # Vectorised division; gives the same float result as a per-row
    # .apply(lambda ...) without the Python-level loop.
    data['duration_ms'] = (data['duration_ms'] / 60000).round(1)
    data.rename(columns={'duration_ms': 'duration_min'}, inplace=True)
data.head()
| valence | year | acousticness | artists | danceability | duration_min | energy | explicit | id | instrumentalness | key | liveness | loudness | mode | name | popularity | release_date | speechiness | tempo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0594 | 1921 | 0.982 | ['Sergei Rachmaninoff', 'James Levine', 'Berli... | 0.279 | 13.9 | 0.211 | 0 | 4BJqT0PrAfrxzMOxytFOIz | 0.878000 | 10 | 0.665 | -20.096 | 1 | Piano Concerto No. 3 in D Minor, Op. 30: III. ... | 4 | 1921 | 0.0366 | 80.954 |
| 1 | 0.9630 | 1921 | 0.732 | ['Dennis Day'] | 0.819 | 3.0 | 0.341 | 0 | 7xPhfUan2yNtyFG0cUWkt8 | 0.000000 | 7 | 0.160 | -12.441 | 1 | Clancy Lowered the Boom | 5 | 1921 | 0.4150 | 60.936 |
| 2 | 0.0394 | 1921 | 0.961 | ['KHP Kridhamardawa Karaton Ngayogyakarta Hadi... | 0.328 | 8.3 | 0.166 | 0 | 1o6I8BglA6ylDMrIELygv1 | 0.913000 | 3 | 0.101 | -14.850 | 1 | Gati Bali | 5 | 1921 | 0.0339 | 110.339 |
| 3 | 0.1650 | 1921 | 0.967 | ['Frank Parker'] | 0.275 | 3.5 | 0.309 | 0 | 3ftBPsC5vPBKxYSee08FDH | 0.000028 | 5 | 0.381 | -9.316 | 1 | Danny Boy | 3 | 1921 | 0.0354 | 100.109 |
| 4 | 0.2530 | 1921 | 0.957 | ['Phil Regan'] | 0.418 | 2.8 | 0.193 | 0 | 4d6HGyGT8e121BsdKmw9v6 | 0.000002 | 3 | 0.229 | -10.096 | 1 | When Irish Eyes Are Smiling | 2 | 1921 | 0.0380 | 101.665 |
# Remove the 'id', 'popularity', 'release_date' and 'mode' columns from the
# table; every later cell works on the remaining columns only.
data = data.drop(['id', 'popularity', 'release_date', 'mode'], axis='columns')

# Preview the trimmed table.
data.head()
| valence | year | acousticness | artists | danceability | duration_min | energy | explicit | instrumentalness | key | liveness | loudness | name | speechiness | tempo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0594 | 1921 | 0.982 | ['Sergei Rachmaninoff', 'James Levine', 'Berli... | 0.279 | 13.9 | 0.211 | 0 | 0.878000 | 10 | 0.665 | -20.096 | Piano Concerto No. 3 in D Minor, Op. 30: III. ... | 0.0366 | 80.954 |
| 1 | 0.9630 | 1921 | 0.732 | ['Dennis Day'] | 0.819 | 3.0 | 0.341 | 0 | 0.000000 | 7 | 0.160 | -12.441 | Clancy Lowered the Boom | 0.4150 | 60.936 |
| 2 | 0.0394 | 1921 | 0.961 | ['KHP Kridhamardawa Karaton Ngayogyakarta Hadi... | 0.328 | 8.3 | 0.166 | 0 | 0.913000 | 3 | 0.101 | -14.850 | Gati Bali | 0.0339 | 110.339 |
| 3 | 0.1650 | 1921 | 0.967 | ['Frank Parker'] | 0.275 | 3.5 | 0.309 | 0 | 0.000028 | 5 | 0.381 | -9.316 | Danny Boy | 0.0354 | 100.109 |
| 4 | 0.2530 | 1921 | 0.957 | ['Phil Regan'] | 0.418 | 2.8 | 0.193 | 0 | 0.000002 | 3 | 0.229 | -10.096 | When Irish Eyes Are Smiling | 0.0380 | 101.665 |
# Count the missing values in each column of `data`.
data.isnull().sum()
valence 0 year 0 acousticness 0 artists 0 danceability 0 duration_min 0 energy 0 explicit 0 instrumentalness 0 key 0 liveness 0 loudness 0 name 0 speechiness 0 tempo 0 dtype: int64
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Group the genres into 10 clusters: features are standardised first so that
# no single column dominates the Euclidean distances KMeans relies on.
# random_state is fixed so the cluster ids are reproducible between runs.
cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('kmeans', KMeans(n_clusters=10, random_state=42)),
])

# Use every numeric column as a feature.  The 'cluster' column written below
# is itself numeric, so it is dropped here (if present) to stop a re-run of
# this cell from feeding the previous labels back in as a feature.
X = genre_data.select_dtypes(np.number).drop(columns=['cluster'], errors='ignore')
cluster_pipeline.fit(X)
genre_data['cluster'] = cluster_pipeline.predict(X)
from sklearn.manifold import TSNE

# Embed the genre feature matrix in 2-D with t-SNE for plotting.
# `X` is expected to be the numeric genre feature matrix built in the genre
# clustering cell.  random_state is fixed because t-SNE is stochastic and
# would otherwise produce a different layout on every run.
tsne_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('tsne', TSNE(n_components=2, verbose=1, random_state=42)),
])
genre_embedding = tsne_pipeline.fit_transform(X)

# Attach the genre name and cluster id to each embedded point so they can be
# used for hover text and colouring.
projection = pd.DataFrame(columns=['x', 'y'], data=genre_embedding)
projection['genres'] = genre_data['genres']
projection['cluster'] = genre_data['cluster']

# Interactive scatter plot of the embedding, coloured by cluster.
fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'genres'])
fig.show()
[t-SNE] Computing 91 nearest neighbors... [t-SNE] Indexed 2973 samples in 0.005s... [t-SNE] Computed neighbors for 2973 samples in 0.409s... [t-SNE] Computed conditional probabilities for sample 1000 / 2973 [t-SNE] Computed conditional probabilities for sample 2000 / 2973 [t-SNE] Computed conditional probabilities for sample 2973 / 2973 [t-SNE] Mean sigma: 0.777516 [t-SNE] KL divergence after 250 iterations with early exaggeration: 76.106087 [t-SNE] KL divergence after 1000 iterations: 1.391694
# Group the individual songs into 20 clusters on standardised numeric
# features.  random_state is fixed so the cluster labels are reproducible
# between runs.
song_cluster_pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('kmeans', KMeans(n_clusters=20, verbose=False, random_state=42)),
    ],
    verbose=False,
)

# Use every numeric column as a feature.  'cluster_label' (written below) is
# numeric too, so it is dropped here (if present) to keep a re-run of this
# cell from treating the previous labels as a feature and from adding it to
# number_cols.
X = data.select_dtypes(np.number).drop(columns=['cluster_label'], errors='ignore')
number_cols = list(X.columns)

song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
data['cluster_label'] = song_cluster_labels
from sklearn.decomposition import PCA

# Standardise the feature matrix `X`, then reduce it to its first two
# principal components so the points can be drawn on a plane.
pca_pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('PCA', PCA(n_components=2)),
    ]
)
song_embedding = pca_pipeline.fit_transform(X)

# One row per embedded point, carrying the title and cluster label taken
# from `data` for hover text and colouring.
projection = pd.DataFrame(data=song_embedding, columns=['x', 'y'])
projection['title'] = data['name']
projection['cluster'] = data['cluster_label']

# Interactive scatter plot of the 2-D projection, coloured by cluster.
fig = px.scatter(
    projection,
    x='x',
    y='y',
    color='cluster',
    hover_data=['x', 'y', 'title'],
)
fig.show()